Data Inspection¶
In [1]:
import plotly
plotly.offline.init_notebook_mode()
In [2]:
import pandas as pd
from tqdm import tqdm
df_list = list()
chunk_iter = pd.read_csv(
"../data/Total_Data_10Y_Top24.csv",
chunksize=100000,
dtype = {
"CANCELLATION_CODE": str,
}
)
for chunk in tqdm(chunk_iter):
df_list.append(chunk)
df = pd.concat(df_list)
df.head()
130it [00:28, 4.57it/s]
Out[2]:
| FL_DATE | OP_UNIQUE_CARRIER | TAIL_NUM | ORIGIN_AIRPORT_SEQ_ID | ORIGIN_CITY_MARKET_ID | ORIGIN | ORIGIN_CITY_NAME | DEST_AIRPORT_SEQ_ID | DEST_CITY_MARKET_ID | DEST | ... | TAXI_IN | ARR_TIME | ARR_DELAY | CANCELLED | CANCELLATION_CODE | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | AA | N002AA | 1105703 | 31057 | CLT | Charlotte, NC | 1129803 | 30194 | DFW | ... | 28.0 | 1214.0 | 9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2014-07-01 | AA | N002AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1105703 | 31057 | CLT | ... | 13.0 | 945.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2014-07-01 | AA | N004AA | 1039705 | 30397 | ATL | Atlanta, GA | 1129803 | 30194 | DFW | ... | 6.0 | 1341.0 | -9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 7.0 | 1159.0 | 4.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 6.0 | 2317.0 | 2.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 24 columns
In [3]:
df.describe()
Out[3]:
| ORIGIN_AIRPORT_SEQ_ID | ORIGIN_CITY_MARKET_ID | DEST_AIRPORT_SEQ_ID | DEST_CITY_MARKET_ID | DEP_TIME | DEP_DELAY | TAXI_OUT | TAXI_IN | ARR_TIME | ARR_DELAY | CANCELLED | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.292656e+07 | 1.292656e+07 | 1.292656e+07 | 1.292656e+07 | 1.271494e+07 | 1.271486e+07 | 1.271048e+07 | 1.270744e+07 | 1.270744e+07 | 1.268139e+07 | 1.292656e+07 | 2.559603e+06 | 2.559603e+06 | 2.559603e+06 | 2.559603e+06 | 2.559603e+06 |
| mean | 1.298824e+06 | 3.161338e+04 | 1.298812e+06 | 3.161370e+04 | 1.332409e+03 | 1.127152e+01 | 1.722694e+01 | 8.678440e+00 | 1.471801e+03 | 5.409690e+00 | 1.675551e-02 | 2.041558e+01 | 2.650336e+00 | 1.520350e+01 | 1.268451e-01 | 2.413592e+01 |
| std | 1.453419e+05 | 1.168922e+03 | 1.453231e+05 | 1.168763e+03 | 5.192380e+02 | 4.514114e+01 | 9.141093e+00 | 6.704624e+00 | 5.554134e+02 | 4.738376e+01 | 1.283540e-01 | 5.935956e+01 | 2.208897e+01 | 3.191662e+01 | 3.210375e+00 | 5.163883e+01 |
| min | 1.039705e+06 | 3.019400e+04 | 1.039705e+06 | 3.019400e+04 | 1.000000e+00 | -2.340000e+02 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | -2.380000e+02 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | 1.129806e+06 | 3.046600e+04 | 1.129806e+06 | 3.046600e+04 | 9.060000e+02 | -5.000000e+00 | 1.200000e+01 | 5.000000e+00 | 1.052000e+03 | -1.400000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 50% | 1.289208e+06 | 3.145400e+04 | 1.289208e+06 | 3.145400e+04 | 1.324000e+03 | -1.000000e+00 | 1.500000e+01 | 7.000000e+00 | 1.515000e+03 | -5.000000e+00 | 0.000000e+00 | 3.000000e+00 | 0.000000e+00 | 2.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 75% | 1.410702e+06 | 3.245700e+04 | 1.410702e+06 | 3.245700e+04 | 1.751000e+03 | 9.000000e+00 | 2.000000e+01 | 1.000000e+01 | 1.930000e+03 | 9.000000e+00 | 0.000000e+00 | 2.000000e+01 | 0.000000e+00 | 1.900000e+01 | 0.000000e+00 | 2.800000e+01 |
| max | 1.501606e+06 | 3.481900e+04 | 1.501606e+06 | 3.481900e+04 | 2.400000e+03 | 3.695000e+03 | 2.270000e+02 | 1.419000e+03 | 2.400000e+03 | 3.680000e+03 | 1.000000e+00 | 3.359000e+03 | 2.692000e+03 | 1.511000e+03 | 9.870000e+02 | 3.581000e+03 |
In [4]:
df.FL_DATE.max()
Out[4]:
'2024-06-30'
In [5]:
df.nunique()
Out[5]:
FL_DATE 3653 OP_UNIQUE_CARRIER 20 TAIL_NUM 9140 ORIGIN_AIRPORT_SEQ_ID 58 ORIGIN_CITY_MARKET_ID 20 ORIGIN 24 ORIGIN_CITY_NAME 24 DEST_AIRPORT_SEQ_ID 58 DEST_CITY_MARKET_ID 20 DEST 24 DEST_CITY_NAME 24 DEP_TIME 1440 DEP_DELAY 1783 TAXI_OUT 195 TAXI_IN 247 ARR_TIME 1440 ARR_DELAY 1806 CANCELLED 2 CANCELLATION_CODE 4 CARRIER_DELAY 1558 WEATHER_DELAY 971 NAS_DELAY 855 SECURITY_DELAY 268 LATE_AIRCRAFT_DELAY 1228 dtype: int64
In [6]:
df.dtypes
Out[6]:
FL_DATE object OP_UNIQUE_CARRIER object TAIL_NUM object ORIGIN_AIRPORT_SEQ_ID int64 ORIGIN_CITY_MARKET_ID int64 ORIGIN object ORIGIN_CITY_NAME object DEST_AIRPORT_SEQ_ID int64 DEST_CITY_MARKET_ID int64 DEST object DEST_CITY_NAME object DEP_TIME float64 DEP_DELAY float64 TAXI_OUT float64 TAXI_IN float64 ARR_TIME float64 ARR_DELAY float64 CANCELLED float64 CANCELLATION_CODE object CARRIER_DELAY float64 WEATHER_DELAY float64 NAS_DELAY float64 SECURITY_DELAY float64 LATE_AIRCRAFT_DELAY float64 dtype: object
Data Cleaning¶
In [7]:
MEDIUM_AIRPORT_CODE = [
"DAL",
"PDX",
"STL",
"RDU",
"HOU",
"SMF",
"MSY",
"SJC",
"SJU",
"SNA"
]
In [8]:
import matplotlib.pyplot as plt
from datetime import time
target_df = df.drop(columns = [
"OP_UNIQUE_CARRIER",
"TAIL_NUM",
"ORIGIN_AIRPORT_SEQ_ID",
"DEST_AIRPORT_SEQ_ID",
"TAXI_OUT",
"TAXI_IN",
])
del df
In [9]:
import numpy as np
target_df['FL_DATE'] = pd.to_datetime(target_df['FL_DATE'])
target_df['ARR_TIME'] = target_df['ARR_TIME'].apply(
lambda x: x if np.isnan(x) else time(hour=int(x)//100%24, minute=int(x)%100)
)
target_df['DEP_TIME'] = target_df['DEP_TIME'].apply(
lambda x: x if np.isnan(x) else time(hour=int(x)//100%24, minute=int(x)%100)
)
In [10]:
airport_set_df = target_df.ORIGIN.drop_duplicates()
airport_review = pd.read_csv("../supplementary/Airport-Reviews-Table.csv")
airport_review = airport_review[airport_review.AIRPORT_CODE.isin(airport_set_df)]
airport_default_score = airport_review.groupby("AIRPORT_CODE")[[
'ratingValue',
'queueTime',
'terminalCleanliness',
'terminalSeating',
'terminalSign',
'foodBeverage',
'airportShopping',
'wifiConnectivity',
'airportStaff',
'sentiment'
]].mean().reset_index()
airport_review_count = airport_review.groupby("AIRPORT_CODE").size().reset_index().rename(columns = {0:"reviewCount"})
airport_review_merged = pd.merge(airport_review_count, airport_default_score, how = "inner", on = "AIRPORT_CODE")
airport_review_merged['Large'] = ~airport_review_merged.AIRPORT_CODE.isin(MEDIUM_AIRPORT_CODE)
airport_review_merged.head()
Out[10]:
| AIRPORT_CODE | reviewCount | ratingValue | queueTime | terminalCleanliness | terminalSeating | terminalSign | foodBeverage | airportShopping | wifiConnectivity | airportStaff | sentiment | Large | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ATL | 333 | 0.256456 | 1.939940 | 2.750751 | 2.390511 | 2.623853 | 1.784195 | 2.450777 | 2.613333 | 1.754902 | -0.741742 | True |
| 1 | CLT | 169 | 0.242012 | 1.958580 | 2.349112 | 2.019481 | 2.625767 | 1.951220 | 2.468468 | 2.100917 | 1.947020 | -0.715976 | True |
| 2 | DAL | 26 | 0.450000 | 3.173913 | 3.695652 | 3.150000 | 3.478261 | 3.304348 | 3.090909 | 3.714286 | 2.875000 | -0.153846 | False |
| 3 | DEN | 151 | 0.260265 | 1.847682 | 2.536424 | 2.253731 | 2.213333 | 2.006757 | 2.308511 | 2.557895 | 1.934307 | -0.761589 | True |
| 4 | DFW | 175 | 0.321143 | 2.298851 | 2.908046 | 2.771429 | 2.812865 | 2.179191 | 2.793478 | 2.647619 | 2.102564 | -0.531429 | True |
In [11]:
target_airport_df = pd.merge(
target_df,
airport_review_merged,
how = "inner",
left_on = "ORIGIN",
right_on = "AIRPORT_CODE",
suffixes = ["", "_origin"]
)
target_airport_df = pd.merge(
target_airport_df,
airport_review_merged,
how = "inner",
left_on = "DEST",
right_on = "AIRPORT_CODE",
suffixes = ["", "_dest"]
)
target_airport_df.rename(columns = {
'reviewCount':'Origin Review Count',
'ratingValue':'Origin Rating Value',
'queueTime':'Origin Queue Time',
'terminalCleanliness':'Origin Terminal Cleanliness',
'terminalSeating':'Origin Terminal Seating',
'terminalSign':'Origin Terminal Sign',
'foodBeverage':'Origin Food Beverage',
'airportShopping':'Origin Airport Shopping',
'wifiConnectivity':'Origin Wifi Connectivity',
'airportStaff':'Origin Airport Staff',
'sentiment':'Origin Sentiment',
'Large':'Origin Busy',
'reviewCount_dest':'Destination Review Count',
'ratingValue_dest':'Destination Rating Value',
'queueTime_dest':'Destination Queue Time',
'terminalCleanliness_dest':'Destination Terminal Cleanliness',
'terminalSeating_dest':'Destination Terminal Seating',
'terminalSign_dest':'Destination Terminal Sign',
'foodBeverage_dest':'Destination Food Beverage',
'airportShopping_dest':'Destination Airport Shopping',
'wifiConnectivity_dest':'Destination Wifi Connectivity',
'airportStaff_dest':'Destination Airport Staff',
'sentiment_dest':'Destination Sentiment',
'Large_dest':'Destination Busy'
}, inplace = True)
target_airport_df.drop(columns = ['AIRPORT_CODE', 'AIRPORT_CODE_dest'], inplace = True)
target_airport_df['CANCELLATION_CODE'] = target_airport_df['CANCELLATION_CODE'].fillna("Not")
target_airport_df.head()
Out[11]:
| FL_DATE | ORIGIN_CITY_MARKET_ID | ORIGIN | ORIGIN_CITY_NAME | DEST_CITY_MARKET_ID | DEST | DEST_CITY_NAME | DEP_TIME | DEP_DELAY | ARR_TIME | ... | Destination Queue Time | Destination Terminal Cleanliness | Destination Terminal Seating | Destination Terminal Sign | Destination Food Beverage | Destination Airport Shopping | Destination Wifi Connectivity | Destination Airport Staff | Destination Sentiment | Destination Busy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | 31057 | CLT | Charlotte, NC | 30194 | DFW | Dallas/Fort Worth, TX | 10:25:00 | -5.0 | 12:14:00 | ... | 2.298851 | 2.908046 | 2.771429 | 2.812865 | 2.179191 | 2.793478 | 2.647619 | 2.102564 | -0.531429 | True |
| 1 | 2014-07-01 | 30194 | DFW | Dallas/Fort Worth, TX | 31057 | CLT | Charlotte, NC | 06:21:00 | -4.0 | 09:45:00 | ... | 1.958580 | 2.349112 | 2.019481 | 2.625767 | 1.951220 | 2.468468 | 2.100917 | 1.947020 | -0.715976 | True |
| 2 | 2014-07-01 | 30397 | ATL | Atlanta, GA | 30194 | DFW | Dallas/Fort Worth, TX | 12:38:00 | -2.0 | 13:41:00 | ... | 2.298851 | 2.908046 | 2.771429 | 2.812865 | 2.179191 | 2.793478 | 2.647619 | 2.102564 | -0.531429 | True |
| 3 | 2014-07-01 | 30194 | DFW | Dallas/Fort Worth, TX | 30397 | ATL | Atlanta, GA | 09:04:00 | 14.0 | 11:59:00 | ... | 1.939940 | 2.750751 | 2.390511 | 2.623853 | 1.784195 | 2.450777 | 2.613333 | 1.754902 | -0.741742 | True |
| 4 | 2014-07-01 | 30194 | DFW | Dallas/Fort Worth, TX | 30397 | ATL | Atlanta, GA | 20:16:00 | 1.0 | 23:17:00 | ... | 1.939940 | 2.750751 | 2.390511 | 2.623853 | 1.784195 | 2.450777 | 2.613333 | 1.754902 | -0.741742 | True |
5 rows × 42 columns
In [12]:
target_airport_df
Out[12]:
| FL_DATE | ORIGIN_CITY_MARKET_ID | ORIGIN | ORIGIN_CITY_NAME | DEST_CITY_MARKET_ID | DEST | DEST_CITY_NAME | DEP_TIME | DEP_DELAY | ARR_TIME | ... | Destination Queue Time | Destination Terminal Cleanliness | Destination Terminal Seating | Destination Terminal Sign | Destination Food Beverage | Destination Airport Shopping | Destination Wifi Connectivity | Destination Airport Staff | Destination Sentiment | Destination Busy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | 31057 | CLT | Charlotte, NC | 30194 | DFW | Dallas/Fort Worth, TX | 10:25:00 | -5.0 | 12:14:00 | ... | 2.298851 | 2.908046 | 2.771429 | 2.812865 | 2.179191 | 2.793478 | 2.647619 | 2.102564 | -0.531429 | True |
| 1 | 2014-07-01 | 30194 | DFW | Dallas/Fort Worth, TX | 31057 | CLT | Charlotte, NC | 06:21:00 | -4.0 | 09:45:00 | ... | 1.958580 | 2.349112 | 2.019481 | 2.625767 | 1.951220 | 2.468468 | 2.100917 | 1.947020 | -0.715976 | True |
| 2 | 2014-07-01 | 30397 | ATL | Atlanta, GA | 30194 | DFW | Dallas/Fort Worth, TX | 12:38:00 | -2.0 | 13:41:00 | ... | 2.298851 | 2.908046 | 2.771429 | 2.812865 | 2.179191 | 2.793478 | 2.647619 | 2.102564 | -0.531429 | True |
| 3 | 2014-07-01 | 30194 | DFW | Dallas/Fort Worth, TX | 30397 | ATL | Atlanta, GA | 09:04:00 | 14.0 | 11:59:00 | ... | 1.939940 | 2.750751 | 2.390511 | 2.623853 | 1.784195 | 2.450777 | 2.613333 | 1.754902 | -0.741742 | True |
| 4 | 2014-07-01 | 30194 | DFW | Dallas/Fort Worth, TX | 30397 | ATL | Atlanta, GA | 20:16:00 | 1.0 | 23:17:00 | ... | 1.939940 | 2.750751 | 2.390511 | 2.623853 | 1.784195 | 2.450777 | 2.613333 | 1.754902 | -0.741742 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12926551 | 2018-12-31 | 31057 | CLT | Charlotte, NC | 31703 | EWR | Newark, NJ | 10:06:00 | -9.0 | 11:49:00 | ... | 1.778146 | 2.258278 | 1.988806 | 2.344595 | 1.568562 | 1.817352 | 1.980000 | 1.562724 | -0.801325 | True |
| 12926552 | 2018-12-31 | 31703 | EWR | Newark, NJ | 31057 | CLT | Charlotte, NC | 14:48:00 | -7.0 | 16:59:00 | ... | 1.958580 | 2.349112 | 2.019481 | 2.625767 | 1.951220 | 2.468468 | 2.100917 | 1.947020 | -0.715976 | True |
| 12926553 | 2018-12-31 | 31703 | JFK | New York, NY | 30194 | DFW | Dallas/Fort Worth, TX | 18:21:00 | 1.0 | 21:55:00 | ... | 2.298851 | 2.908046 | 2.771429 | 2.812865 | 2.179191 | 2.793478 | 2.647619 | 2.102564 | -0.531429 | True |
| 12926554 | 2018-12-31 | 31703 | JFK | New York, NY | 30977 | ORD | Chicago, IL | 07:54:00 | -5.0 | 10:01:00 | ... | 2.065789 | 2.377193 | 2.107692 | 2.423423 | 1.879464 | 2.303448 | 2.165605 | 1.880383 | -0.666667 | True |
| 12926555 | 2018-12-31 | 30977 | ORD | Chicago, IL | 31703 | JFK | New York, NY | 10:37:00 | -3.0 | 13:52:00 | ... | 1.784689 | 2.327751 | 2.082111 | 2.387019 | 1.429952 | 2.061594 | 2.223776 | 1.635204 | -0.742243 | True |
12926556 rows × 42 columns
Visualization¶
Delay Trend¶
In [12]:
delay_by_year
Out[12]:
| Year | variable | Delay (m) | |
|---|---|---|---|
| 0 | 2014 00 | DEP_DELAY | 10.378665 |
| 1 | 2015 00 | DEP_DELAY | 10.414516 |
| 2 | 2016 00 | DEP_DELAY | 10.474933 |
| 3 | 2017 00 | DEP_DELAY | 10.878647 |
| 4 | 2018 00 | DEP_DELAY | 10.739813 |
| ... | ... | ... | ... |
| 61 | 2020 00 | LATE_AIRCRAFT_DELAY | 16.810856 |
| 62 | 2021 00 | LATE_AIRCRAFT_DELAY | 22.448871 |
| 63 | 2022 00 | LATE_AIRCRAFT_DELAY | 24.445999 |
| 64 | 2023 00 | LATE_AIRCRAFT_DELAY | 27.877123 |
| 65 | 2024 00 | LATE_AIRCRAFT_DELAY | 29.435622 |
66 rows × 3 columns
In [13]:
import plotly.express as px
delay_by_year = target_airport_df[[
'FL_DATE',
"DEP_DELAY",
'CARRIER_DELAY',
'WEATHER_DELAY',
'NAS_DELAY',
'SECURITY_DELAY',
'LATE_AIRCRAFT_DELAY',
]]
delay_by_year['Year'] = delay_by_year['FL_DATE'].dt.strftime("%Y %m")
delay_by_year = delay_by_year.groupby("Year")[[
"DEP_DELAY",
'CARRIER_DELAY',
'WEATHER_DELAY',
'NAS_DELAY',
'SECURITY_DELAY',
'LATE_AIRCRAFT_DELAY',
]].mean().reset_index()
delay_by_year = delay_by_year.melt(id_vars = "Year", value_name = "Delay (m)")
fig = px.area(
delay_by_year,
x="Year",
y="Delay (m)",
color ='variable',
title='Airport Delay Trend in the Past 10Y',
)
fig.update_layout(
height=600,
)
fig.show()
C:\Users\wongh\AppData\Local\Temp\ipykernel_22092\870454222.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Airport Review¶
First of all, let's have a look at the Review Data we collected
In [13]:
airport_review_merged.head()
Out[13]:
| AIRPORT_CODE | reviewCount | ratingValue | queueTime | terminalCleanliness | terminalSeating | terminalSign | foodBeverage | airportShopping | wifiConnectivity | airportStaff | sentiment | Large | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ATL | 333 | 0.256456 | 1.939940 | 2.750751 | 2.390511 | 2.623853 | 1.784195 | 2.450777 | 2.613333 | 1.754902 | -0.741742 | True |
| 1 | CLT | 169 | 0.242012 | 1.958580 | 2.349112 | 2.019481 | 2.625767 | 1.951220 | 2.468468 | 2.100917 | 1.947020 | -0.715976 | True |
| 2 | DAL | 26 | 0.450000 | 3.173913 | 3.695652 | 3.150000 | 3.478261 | 3.304348 | 3.090909 | 3.714286 | 2.875000 | -0.153846 | False |
| 3 | DEN | 151 | 0.260265 | 1.847682 | 2.536424 | 2.253731 | 2.213333 | 2.006757 | 2.308511 | 2.557895 | 1.934307 | -0.761589 | True |
| 4 | DFW | 175 | 0.321143 | 2.298851 | 2.908046 | 2.771429 | 2.812865 | 2.179191 | 2.793478 | 2.647619 | 2.102564 | -0.531429 | True |
In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
corr_matrix = airport_review_merged.loc[:, 'reviewCount':].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', mask = mask)
plt.title('Airport Review Score Correlation')
plt.show()
Summary
The majority of the score given by users are very correlated,
with the exception of reviewCount of an airport, which inversely correlate with all other attributes.
Airport Ranking¶
In [16]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
In [61]:
fig = go.Figure()
fig = px.scatter(
data_frame = airport_review_merged,
x='sentiment',
y='ratingValue',
color='ratingValue',
size = 'reviewCount',
color_continuous_scale = 'viridis',
facet_col = "Large",
text='AIRPORT_CODE',
log_y = True
)
fig.update_traces(textposition='top center')
fig.for_each_annotation(
lambda a: a.update(text= "Medium Airport" if a.text == "Large=False" else "Large Airport")
)
fig.update_layout(
height=600,
width=1000,
title_text="Airport Sentiment Visualization",
showlegend=False
)
fig.show()
Review Performance per Airport¶
In [18]:
df_columns = [
'queueTime',
'terminalCleanliness',
'terminalSeating',
'terminalSign',
'foodBeverage',
'airportShopping',
'wifiConnectivity',
'airportStaff',
]
df_better_names = [
'Queue Time',
'Terminal Cleanliness',
'Terminal Seating',
'Terminal Sign',
'Food Beverage',
'Airport Shopping',
'Wifi Connectivity',
'Airport Staff',
]
fig = make_subplots(rows=4, cols=2, subplot_titles=df_better_names)
col = [1, 2]*4
row = [1, 1, 2, 2, 3, 3, 4, 4]
for c, r, column, better_column in zip(col, row, df_columns, df_better_names):
airport_review_merged.sort_values(column, ascending = False, inplace = True)
trace = go.Bar(
x=airport_review_merged['AIRPORT_CODE'],
y=airport_review_merged[column],
marker=dict(
color = airport_review_merged[column],
colorscale='viridis'
)
)
fig.add_trace(
trace,
row=r,
col=c
)
fig.update_layout(
height=1000,
title_text="Individual Scoring Attributes of Airports",
showlegend=False
)
fig.show()
Delay vs Ranking¶
In [19]:
target_airport_df.columns
Out[19]:
Index(['FL_DATE', 'ORIGIN_CITY_MARKET_ID', 'ORIGIN', 'ORIGIN_CITY_NAME',
'DEST_CITY_MARKET_ID', 'DEST', 'DEST_CITY_NAME', 'DEP_TIME',
'DEP_DELAY', 'ARR_TIME', 'ARR_DELAY', 'CANCELLED', 'CANCELLATION_CODE',
'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY',
'LATE_AIRCRAFT_DELAY', 'Origin Review Count', 'Origin Rating Value',
'Origin Queue Time', 'Origin Terminal Cleanliness',
'Origin Terminal Seating', 'Origin Terminal Sign',
'Origin Food Beverage', 'Origin Airport Shopping',
'Origin Wifi Connectivity', 'Origin Airport Staff', 'Origin Sentiment',
'Origin Busy', 'Destination Review Count', 'Destination Rating Value',
'Destination Queue Time', 'Destination Terminal Cleanliness',
'Destination Terminal Seating', 'Destination Terminal Sign',
'Destination Food Beverage', 'Destination Airport Shopping',
'Destination Wifi Connectivity', 'Destination Airport Staff',
'Destination Sentiment', 'Destination Busy'],
dtype='object')
In [30]:
train_columns = [
'Origin Rating Value',
'Origin Queue Time',
'Origin Terminal Cleanliness',
'Origin Terminal Seating',
'Origin Terminal Sign',
'Origin Food Beverage',
'Origin Airport Shopping',
'Origin Wifi Connectivity',
'Origin Airport Staff',
'Origin Sentiment',
'Origin Busy',
'Destination Rating Value',
'Destination Queue Time',
'Destination Terminal Cleanliness',
'Destination Terminal Seating',
'Destination Terminal Sign',
'Destination Food Beverage',
'Destination Airport Shopping',
'Destination Wifi Connectivity',
'Destination Airport Staff',
'Destination Sentiment',
'Destination Busy'
]
label_columns = [
"DEP_DELAY",
"ARR_DELAY",
"CARRIER_DELAY",
"WEATHER_DELAY",
"NAS_DELAY",
"SECURITY_DELAY",
"LATE_AIRCRAFT_DELAY",
]
Delay Data Size >0
In [32]:
cal_df = target_airport_df[['ORIGIN'] + label_columns]
delay = cal_df.groupby('ORIGIN')[label_columns].apply(lambda x: (x > 0).mean()*100).reset_index().sort_values("ARR_DELAY", ascending = False)
big_delay = cal_df.groupby('ORIGIN')[label_columns].apply(lambda x: (x > 60).mean()*100).reset_index().sort_values("ARR_DELAY", ascending = False)
In [53]:
import pandas as pd
import matplotlib.pyplot as plt
df1 = delay
df2 = big_delay
fig, ax = plt.subplots(figsize=(12, 6))
bar_width = 0.35
index = range(len(df1))
bars3 = ax.bar([i for i in index], [100 for _ in range(len(df2['ARR_DELAY']))], bar_width, label='All Flight', color='royalblue')
bars1 = ax.bar(index, df1['ARR_DELAY'], bar_width, label='Arrival Delay > 0', color='pink')
bars2 = ax.bar([i for i in index], df2['ARR_DELAY'], bar_width, label='Arrival Delay > 60', color='red')
# Adding labels, title, and legend
ax.set_xlabel('Airport Code')
ax.set_ylabel('Percentage (%)')
ax.set_title('(%) of Delay Flights by Airport')
ax.set_xticks([i + bar_width / 2 for i in index])
ax.set_xticklabels(df1['ORIGIN'])
ax.legend()
plt.savefig("Delay_by_Airport")
In [13]:
target_airport_df.columns
Out[13]:
Index(['FL_DATE', 'ORIGIN_CITY_MARKET_ID', 'ORIGIN', 'ORIGIN_CITY_NAME',
'DEST_CITY_MARKET_ID', 'DEST', 'DEST_CITY_NAME', 'DEP_DELAY',
'ARR_DELAY', 'CANCELLED', 'CANCELLATION_CODE', 'CARRIER_DELAY',
'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
'Origin Review Count', 'Origin Rating Value', 'Origin Queue Time',
'Origin Terminal Cleanliness', 'Origin Terminal Seating',
'Origin Terminal Sign', 'Origin Food Beverage',
'Origin Airport Shopping', 'Origin Wifi Connectivity',
'Origin Airport Staff', 'Origin Sentiment', 'Origin Busy',
'Destination Review Count', 'Destination Rating Value',
'Destination Queue Time', 'Destination Terminal Cleanliness',
'Destination Terminal Seating', 'Destination Terminal Sign',
'Destination Food Beverage', 'Destination Airport Shopping',
'Destination Wifi Connectivity', 'Destination Airport Staff',
'Destination Sentiment', 'Destination Busy'],
dtype='object')
In [51]:
target_airport_df = target_airport_df[(target_airport_df['FL_DATE'].dt.month >= 5) & (target_airport_df['FL_DATE'].dt.month <= 8)]
target_airport_df.nunique()
Out[51]:
FL_DATE 1230 ORIGIN_CITY_MARKET_ID 20 ORIGIN 24 ORIGIN_CITY_NAME 24 DEST_CITY_MARKET_ID 20 DEST 24 DEST_CITY_NAME 24 DEP_TIME 1440 DEP_DELAY 1513 ARR_TIME 1440 ARR_DELAY 1563 CANCELLED 2 CANCELLATION_CODE 5 CARRIER_DELAY 1317 WEATHER_DELAY 716 NAS_DELAY 684 SECURITY_DELAY 200 LATE_AIRCRAFT_DELAY 1063 Origin Review Count 23 Origin Rating Value 24 Origin Queue Time 24 Origin Terminal Cleanliness 23 Origin Terminal Seating 24 Origin Terminal Sign 24 Origin Food Beverage 24 Origin Airport Shopping 24 Origin Wifi Connectivity 23 Origin Airport Staff 24 Origin Sentiment 23 Origin Busy 2 Destination Review Count 23 Destination Rating Value 24 Destination Queue Time 24 Destination Terminal Cleanliness 23 Destination Terminal Seating 24 Destination Terminal Sign 24 Destination Food Beverage 24 Destination Airport Shopping 24 Destination Wifi Connectivity 23 Destination Airport Staff 24 Destination Sentiment 23 Destination Busy 2 dtype: int64
In [17]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
corr_matrix = target_airport_df.loc[:, train_columns + label_columns].corr()
plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Delays')
plt.show()
plt.figure(figsize=(10, 5))
label_columns.remove('NAS_DELAY')
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Delays (Without NAS)')
plt.show()
In [40]:
import plotly.express as px
fig = px.box(
target_airport_df.sample(1000000, replace = False).sort_values(["Destination Busy", "NAS_DELAY"]),
x="DEST",
y="NAS_DELAY",
log_y = True,
color = f"Destination Busy",
color_discrete_map = {True: 'red', False: 'blue'}
)
fig.update_layout(
height=400,
title_text=f"Destination Aiport vs NAS Delay",
showlegend=True
)
fig.show()